# Size of the "word" positional attribute of the REUTERS corpus. The value
# serves as the upper bound when enumerating corpus positions further down.
cpos_total <- cl_attribute_size(
  registry = get_tmp_registry(),
  corpus = "REUTERS",
  attribute = "word",
  attribute_type = "p"
)
# Direct route: turn every corpus position (0 .. cpos_total - 1) into its
# string value with a single call.
token_stream_str <- cl_cpos2str(
  registry = get_tmp_registry(),
  corpus = "REUTERS",
  p_attribute = "word",
  cpos = seq.int(0, cpos_total - 1)
)
# Two-step route to the same token stream:
# step 1 - map corpus positions (0 .. cpos_total - 1) to token ids
token_stream_ids <- cl_cpos2id(
  registry = get_tmp_registry(),
  corpus = "REUTERS",
  p_attribute = "word",
  cpos = seq.int(0, cpos_total - 1)
)
# step 2 - map the token ids to their string values
token_stream_str <- cl_id2str(
  registry = get_tmp_registry(),
  corpus = "REUTERS",
  p_attribute = "word",
  id = token_stream_ids
)
# get corpus positions of a token
token_to_get <- "oil"
# look up the id of the token in the "word" attribute
id_oil <- cl_str2id(
  corpus = "REUTERS", p_attribute = "word",
  str = token_to_get, registry = get_tmp_registry()
)
# Assign the result to cpos_oil only: a chained assignment to the name
# cl_id2cpos would mask the function of the same name with a plain vector.
cpos_oil <- cl_id2cpos(
  corpus = "REUTERS", p_attribute = "word",
  id = id_oil, registry = get_tmp_registry()
)
# get frequency of token
oil_freq <- cl_id2freq(
  corpus = "REUTERS", p_attribute = "word",
  id = id_oil, registry = get_tmp_registry()
)
length(cpos_oil) # needs to be the same as oil_freq
# Regular-expression lookup: collect the ids of all "word" values matching
# the pattern "M.*" ...
ids <- cl_regex2id(
  registry = get_tmp_registry(),
  corpus = "REUTERS",
  p_attribute = "word",
  regex = "M.*"
)
# ... and resolve those ids to their string values.
m_words <- cl_id2str(
  registry = get_tmp_registry(),
  corpus = "REUTERS",
  p_attribute = "word",
  id = ids
)
# Run the code above in your browser using DataLab